from pypatnlp import *

def _export_corpus(extractor, source, target_path):
    """Transform *source* with *extractor*, wrap the result in a ``Corpus``,
    write it to *target_path* and return the ``Corpus``."""
    converted = Corpus(extractor.transform(source))
    write_corpus_to_file(target_path, converted)
    return converted


# Step 1: convert the hinnavaatlus corpus. The plain-text file is parsed into
# an intermediate pycorp file, which is then passed through as_t3corpus to
# produce the pycorp file used below.
parse_plain_corpus('hinnavaatlus.txt', 'plain_hinnavaatlus.pycorp')
as_t3corpus('plain_hinnavaatlus.pycorp', 'hinnavaatlus.pycorp')

# Step 2: prepare the feature extractor. It is built once from the 'lemma'
# and 'wtype' local extractors and reused for both corpora below.
pycorp = PyCorpus('hinnavaatlus.pycorp', readonly=True)
exs = get_local_extractors(['lemma', 'wtype'], 0)
doc_fe = DocumentFeatureExtractor(*exs, nooffset=True)
fe = CorpusFeatureExtractor(doc_fe)

# Step 3: create the C++ hinnavaatlus corpus and write it to disk.
corp = _export_corpus(fe, pycorp, 'hinnavaatlus.corp')

# Step 4: run the same feature extractor over the NER dataset to create the
# background corpus.
ner_pycorp = PyCorpus('../../data/estner.pycorp')
ner_corp = _export_corpus(fe, ner_pycorp, 'estner.corp')

